import sqlite3
# 获取网页数据
import requests
# 正则表达式
import re

import yaml
# 网页解析，获取数据
from bs4 import BeautifulSoup
# 保存为excel
import xlwt

with open("config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

# 正则表达式规则 *表示多个字符，？表示0个到多个
# 影片排名链接的规则
findIndex = re.compile(r'board-index.*?>(\d+).*?')
# 影片图片的链接
findImage = re.compile(r'class="board-img".*?src="(.*?)"')
# 影片片名
findTitle = re.compile(r'title="(.*?)">')
# 影片主演
findActor = re.compile(r'class="star">(.|\n)(.*)')
# 影片上映时间
findTime = re.compile(r'class="releasetime">(.*?)</p> ')
# 猫眼评分
findScore1 = re.compile(r'class="integer">(.*?)</i>')
findScore2 = re.compile(r'class="fraction">(.*?)</i>')


# 爬取网页
# 解析数据
# 保存数据

def main():
    baseurl = config['base_url']
    # 1.爬取网页
    datalist = getData(baseurl)
    datalist = cleanData(datalist)
    print(datalist)

    # 3.保存数据
    savepath = "猫眼TOP100.xls"
    saveData(datalist, savepath)
    dbpath = config['database']['db_name']
    saveData2DB(datalist, dbpath)


def getData(baseUrl):
    datalist = []
    for i in range(0, 10):
        url = baseUrl + str(i * 10)
        html = askUrl(url)
        # 解析数据
        soup = BeautifulSoup(html, "html.parser")

        for item in soup.find_all("dd"):
            # 测试
            # print(item)
            data = []
            item = str(item)
            # 排名
            index = re.findall(findIndex, item)[0]
            data.append(index)
            # 图片地址
            image = re.findall(findImage, item)[0]
            data.append(image)
            # 标题
            title = re.findall(findTitle, item)[0]
            data.append(title)
            # 作者
            actor = re.findall(findActor, item)[0]
            actorList = list(actor)
            for i in actorList:
                actorNew = "".join(i).strip()
            data.append(actorNew)
            # 上映时间
            time = re.findall(findTime, item)[0]
            data.append(time)
            # 分数
            score1 = re.findall(findScore1, item)[0]
            # data.append(score1)
            score2 = re.findall(findScore2, item)[0]
            # data.append(score2)
            score = score1 + score2
            data.append(score)

            # print(data)
            datalist.append(data)
    # print(datalist)
    return datalist


# 爬取网页
def askUrl(url):
    # 模拟浏览器头部消息，向猫眼浏览器发送消息
    headers = {
        "Accept": "* / *",
        "Cookie": config['Cookie'],
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"}
    html = ""
    try:
        response = requests.get(url, headers=headers)
        html = response.content.decode("utf-8")
        # print(html)
    except requests.exceptions.RequestException as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html

#数据清洗
def cleanData(datalist):
    cleaned_data = []
    for item in datalist:
        item[3] = item[3][3:]
        item[4] = item[4][5:9]
        cleaned_data.append(item)
    return cleaned_data


# 保存数据到excel中
def saveData(datalist, savepath):
    # 创建book对象
    book = xlwt.Workbook(encoding="utf-8")
    # 创建工作表
    sheet = book.add_sheet("猫眼TOP100", cell_overwrite_ok=True)
    col = ("电影排名", "图片地址", "电影名称", "演出人员", "上映时间", "电影评分")
    for i in range(0, 6):
        sheet.write(0, i, col[i])
        # col[i]代表列名
    for i in range(0, 100):
        print("第%d条" % (i + 1))
        try:
            data = datalist[i]
        except:
            continue
        for j in range(0, 6):
            # 向表中写入数据
            sheet.write(i + 1, j, data[j])
    book.save(savepath)


# 保存数据到数据库中
def saveData2DB(cleaned_data, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    table_name = config['database']['table_name']

    for data in cleaned_data:
        for index in range(len(data)):
            if index == 4:
                data[index] = '"' + str(data[index]) + '"'
                continue
            data[index] = '"' + data[index] + '"'

        sql = '''
                        insert into {}(
                        movie_rank,image_link,name,actor,time,score
                        )
                        values(%s)'''.format(table_name) % ",".join(data)
        # values(%s)'''%",". join('%s' %a for a in data)
        print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()


def init_db(dbpath):
    # 创建数据表
    table_name = config['database']['table_name']
    sql = """
            create table {}
            (
                movie_rank integer ,
                image_link text ,
                name varchar,
                actor varchar,
                time text,
                score numeric


            )
            """.format(table_name)
    print("sql: {}".format(sql))
    conn = sqlite3.connect(dbpath)
    # 获取游标
    cursor = conn.cursor()
    # 执行sql语句
    cursor.execute(sql)
    conn.commit()
    # 关闭数据库
    conn.close()


if __name__ == '__main__':  # 当程序执行时调用函数
    main()
    # init_db("movietest.db")
    print('爬取完成')







